Here, we will transfer the label and UMAP coordinates from the discovery cohort to the validation cohort. We will do it one compartment at a time (e.g. pc), so we can validate that the transfer is robust
Specifically, we will follow these steps for every compartment:
library(Seurat)
library(tidyverse)
library(caret)
library(class)
library(harmony)
library(here)
library(glue)
library(DT)
library(pheatmap2)
source(here("scRNA-seq/bin/SLOcatoR_functions.R"))
source(here("scRNA-seq/bin/utils.R"))
source(here("scRNA-seq/bin/utils_final_clusters.R"))
color_palette <- c("#1CFFCE", "#90AD1C", "#C075A6", "#85660D", "#5A5156", "#AA0DFE",
"#F8A19F", "#F7E1A0", "#1C8356", "#FEAF16", "#822E1C", "#C4451C",
"#1CBE4F", "#325A9B", "#F6222E", "#FE00FA", "#FBE426", "#16FF32",
"black", "#3283FE", "#B00068", "#DEA0FD", "#B10DA1", "#E4E1E3",
"#90AD1C", "#FE00FA", "#85660D", "#3B00FB", "#822E1C", "coral2",
"#1CFFCE", "#1CBE4F", "#3283FE", "#FBE426", "#F7E1A0", "#325A9B",
"#2ED9FF", "#B5EFB5", "#5A5156", "#DEA0FD", "#FEAF16", "#683B79",
"#B10DA1", "#1C7F93", "#F8A19F", "dark orange", "#FEAF16", "#FBE426",
"Brown")
# Read data, clean and include annotation
pc <- readRDS(here("scRNA-seq/results/R_objects/seurat_objects_revision/4.9-PC_seurat_object_discovery_validation_cohorts.rds"))
pc <- updateAnnotation(pc)
pc$annotation_20230508 <- pc$annotation_20220619
pc$annotation_20230508[is.na(pc$annotation_20230508)] <- "unannotated"
pc$annotation_20230508 <- factor(
pc$annotation_20230508,
levels = c(names(colors_20230508[["PC"]]), "unannotated")
)
Idents(pc) <- "annotation_20230508"
pc$is_reference <- ifelse(
pc$cohort_type == "discovery" & pc$assay == "3P",
"reference",
"query"
)
DimPlot(
pc,
group.by = "annotation_20230508",
split.by = "cohort_type",
cols = c(colors_20230508[["PC"]], "unannotated" = "black")
) &
theme(legend.text = element_text(size = 6))
We stil see some clusters that did not integrate well. Let us inspect the doublet scores and markers of other populations:
FeaturePlot(pc, c("CD3D", "LYZ")) & scale_color_viridis_c(option = "magma")
FeaturePlot(pc[, pc$cohort_type == "validation"], "doublet_score_scDblFinder") &
scale_color_viridis_c(option = "magma")
FeaturePlot(pc, c("S.Score", "G2M.Score")) & scale_color_viridis_c(option = "magma")
We see a clear cluster of T-cell doublets. Let us exclude them from the dataset:
pc <- FindNeighbors(pc, reduction = "harmony", dims = 1:25)
pc <- FindClusters(pc, resolution = 0.3)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 14913
## Number of edges: 603207
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9163
## Number of communities: 11
## Elapsed time: 3 seconds
DimPlot(pc, cols = color_palette)
markers6 <- FindMarkers(pc, ident.1 = "6", only.pos = TRUE, logfc.threshold = 0.6)
DT::datatable(markers6)
pc <- pc[, !(pc$cohort_type == "validation" & pc$seurat_clusters == "6")]
DimPlot(pc)
In addition, we see that cells from other compartments (GCBC, MBC) were included to map trajectories. For clarity, we will exclude them from the analysis:
exclude <- c("Dark Zone GCBC", "Light Zone GCBC")
pc <- pc[, !(pc$annotation_20230508 %in% exclude)]
pc$annotation_20230508 <- droplevels(pc$annotation_20230508)
colors_20230508$PC <- colors_20230508$PC[!(names(colors_20230508$PC) %in% exclude)]
Finally, let’s explore how doublet scores distribute per cluster, and remove any cluster with a high doublet score:
VlnPlot(pc[, pc$cohort_type == "validation"], features = "doublet_score_scDblFinder", pt.size = 0)
Reintegrate:
pc$type <- str_c(pc$cohort_type, pc$assay, sep = "_")
shared_hvg <- find_assay_specific_features(pc, assay_var = "type")
pc <- integrate_assays(
pc,
assay_var = "type",
shared_hvg = shared_hvg
)
pc <- RunUMAP(pc, dims = 1:25, reduction = "harmony")
DimPlot(
pc,
group.by = "annotation_20230508",
split.by = "cohort_type",
cols = c(colors_20230508[["PC"]], "unannotated" = "gray")
) &
theme(legend.text = element_text(size = 6))
Transfer label:
# Split training and test sets
Idents(pc) <- "annotation_20230508"
data_sets <- split_training_and_test_sets(
pc,
split_var = "cohort_type",
referece_label = "discovery",
query_label = "validation",
reduction = "harmony",
n_dims = 25
)
# Transfer label
annotation_query_df <- transfer_label(
seurat_obj = pc,
training_set = data_sets$training_set,
test_set = data_sets$test_set,
k = 8,
response_var = "annotation_20230508"
)
# Transfer UMAP coords
# umap_test_df <- transfer_umap_coords(
# seurat_obj = pc,
# training_set = data_sets$training_set,
# test_set = data_sets$test_set,
# umap1_var = "UMAP_1_20220215",
# umap2_var = "UMAP_2_20220215",
# k = 15
# )
# Include annotation and UMAP coords in Seurat object
pc$annotation_20230508[annotation_query_df$query_cells] <- annotation_query_df$annotation
Idents(pc) <- "annotation_20230508"
pc$annotation_20230508_probability <- NA
pc$annotation_20230508_probability[annotation_query_df$query_cells] <- annotation_query_df$annotation_prob
# pc$UMAP_1_20220215[umap_test_df$query_cells] <- umap_test_df$UMAP1
# pc$UMAP_2_20220215[umap_test_df$query_cells] <- umap_test_df$UMAP2
p1 <- FeaturePlot(
pc[, annotation_query_df$query_cells],
"annotation_20230508_probability"
) + scale_color_viridis_c(option = "magma")
p2 <- FeaturePlot(
pc[, annotation_query_df$query_cells],
"doublet_score_scDblFinder"
) + scale_color_viridis_c(option = "magma")
p1 | p2
DimPlot(
pc,
group.by = "annotation_20230508",
split.by = "cohort_type",
cols = colors_20230508[["PC"]]
) &
theme(legend.text = element_text(size = 6))
table(pc$annotation_20230508_probability)
##
## 0.222222222222222 0.25 0.333333333333333 0.375 0.444444444444444 0.5 0.625 0.666666666666667 0.75 0.777777777777778 0.875 0.888888888888889 1
## 1 69 1 426 3 824 900 7 803 2 852 5 1288
FeaturePlot(
pc[, pc$cohort_type == "validation"],
c("annotation_20230508_probability", "doublet_score_scDblFinder")
) & scale_color_viridis_c(option = "magma")
There is a cluster with a high doublet score and a low annotation probability. We will flag it for later discussion.
Plot heatmap:
goi <- c("SUGCT", "AICDA", "CXCR4", "LMO2", "CD83", "BCL2A1", "BCL6", "IRF8",
"MEF2B", "MS4A1", "PAX5", "PRDM1", "XBP1", "IRF4", "SLAMF7", "SSR4",
"MZB1", "DERL3", "CREB3L2", "FKBP11", "IGHG1", "IGHG2", "IGHG3", "IGHG4",
"IGHA1", "IGHA2", "IGHM", "IGHD", "CCDC50", "FCMR", "CD9", "CD44", "H2AFZ",
"MKI67", "TUBA1B", "TOP2A")
avgexpr_mat <- AverageExpression(
pc[, pc$cohort_type == "validation"],
features = goi,
assays = "RNA",
return.seurat = FALSE,
group.by = "annotation_20230508",
slot = "data"
)$RNA
# Define annotation column
cell_types <- levels(pc$annotation_20230508)
mycolors <- list(cell_type = c(colors_20230508$PC))
annotation_col <- data.frame(cell_type = cell_types)
rownames(annotation_col) <- cell_types
# Scale per row between 0 and 1
input_mat <- t(apply(avgexpr_mat, 1, function(x) (x - min(x)) / diff(range(x))))
# Plot heatmap
pheatmap2(
input_mat,
annotation_col = annotation_col,
annotation_colors = mycolors,
annotation_names_col = TRUE,
annotation_legend = TRUE,
show_rownames = TRUE,
show_colnames = FALSE,
border_color = NA,
cluster_rows = FALSE,
cluster_cols = FALSE,
fontsize_row = 6
)
Plot annotation probability:
pc@meta.data %>%
filter(cohort_type == "validation") %>%
ggplot(aes(annotation_20230508, annotation_20230508_probability, fill = annotation_20230508)) +
geom_boxplot() +
scale_fill_manual(values = colors_20230508$PC) +
theme_classic() +
theme(legend.position = "none") +
coord_flip()
saveRDS(pc, here("scRNA-seq/results/R_objects/seurat_objects_revision/5.9-pc_seurat_object_discovery_validation_cohorts_annotation.rds"))
sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=es_ES.UTF-8 LC_COLLATE=en_US.UTF-8 LC_MONETARY=es_ES.UTF-8 LC_MESSAGES=en_US.UTF-8 LC_PAPER=es_ES.UTF-8 LC_NAME=C LC_ADDRESS=C LC_TELEPHONE=C LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] grid stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] pheatmap2_0.1.0 fastcluster_1.2.3 parallelDist_0.2.6 gtable_0.3.1 scales_1.2.1 RColorBrewer_1.1-3 DT_0.27 glue_1.6.2 here_1.0.1 harmony_0.1.1 Rcpp_1.0.10 class_7.3-21 caret_6.0-93 lattice_0.20-45 forcats_1.0.0 stringr_1.5.0 dplyr_1.1.0 purrr_1.0.1 readr_2.1.3 tidyr_1.3.0 tibble_3.1.8 ggplot2_3.4.0 tidyverse_1.3.2 SeuratObject_4.1.3 Seurat_4.3.0 BiocStyle_2.26.0
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.3 spatstat.explore_3.0-6 reticulate_1.28 tidyselect_1.2.0 htmlwidgets_1.6.1 Rtsne_0.16 pROC_1.18.0 munsell_0.5.0 codetools_0.2-19 ica_1.0-3 future_1.31.0 miniUI_0.1.1.1 withr_2.5.0 spatstat.random_3.1-3 colorspace_2.1-0 progressr_0.13.0 highr_0.10 knitr_1.42 rstudioapi_0.14 stats4_4.2.0 ROCR_1.0-11 tensor_1.5 listenv_0.9.0 labeling_0.4.2 polyclip_1.10-4 farver_2.1.1 rprojroot_2.0.3 parallelly_1.34.0 vctrs_0.5.2 generics_0.1.3 ipred_0.9-13 xfun_0.37 timechange_0.2.0 R6_2.5.1 ggbeeswarm_0.7.1 spatstat.utils_3.0-1 cachem_1.0.6 promises_1.2.0.1 nnet_7.3-18 googlesheets4_1.0.1 beeswarm_0.4.0 globals_0.16.2 goftest_1.2-3 timeDate_4022.108 rlang_1.0.6 splines_4.2.0 lazyeval_0.2.2 ModelMetrics_1.2.2.2 gargle_1.3.0 spatstat.geom_3.0-6 broom_1.0.3
## [52] BiocManager_1.30.19 yaml_2.3.7 reshape2_1.4.4 abind_1.4-5 modelr_0.1.10 crosstalk_1.2.0 backports_1.4.1 httpuv_1.6.8 tools_4.2.0 lava_1.7.1 bookdown_0.32 ellipsis_0.3.2 jquerylib_0.1.4 ggridges_0.5.4 plyr_1.8.8 rpart_4.1.19 deldir_1.0-6 pbapply_1.7-0 cowplot_1.1.1 zoo_1.8-11 haven_2.5.1 ggrepel_0.9.3 cluster_2.1.4 fs_1.6.0 magrittr_2.0.3 data.table_1.14.6 scattermore_0.8 lmtest_0.9-40 reprex_2.0.2 RANN_2.6.1 googledrive_2.0.0 fitdistrplus_1.1-8 matrixStats_0.63.0 hms_1.1.2 patchwork_1.1.2 mime_0.12 evaluate_0.20 xtable_1.8-4 readxl_1.4.1 gridExtra_2.3 compiler_4.2.0 KernSmooth_2.23-20 crayon_1.5.2 htmltools_0.5.4 later_1.3.0 tzdb_0.3.0 RcppParallel_5.1.6 lubridate_1.9.1 DBI_1.1.3 dbplyr_2.3.2 MASS_7.3-58.2
## [103] Matrix_1.5-3 cli_3.6.0 parallel_4.2.0 gower_1.0.1 igraph_1.3.5 pkgconfig_2.0.3 sp_1.6-0 plotly_4.10.1 spatstat.sparse_3.0-0 recipes_1.0.4 xml2_1.3.3 foreach_1.5.2 vipor_0.4.5 bslib_0.4.2 hardhat_1.2.0 prodlim_2019.11.13 rvest_1.0.3 digest_0.6.31 sctransform_0.3.5 RcppAnnoy_0.0.20 spatstat.data_3.0-0 rmarkdown_2.20 cellranger_1.1.0 leiden_0.4.3 uwot_0.1.14 shiny_1.7.4 lifecycle_1.0.3 nlme_3.1-162 jsonlite_1.8.4 limma_3.54.1 viridisLite_0.4.1 fansi_1.0.4 pillar_1.8.1 ggrastr_1.0.1 fastmap_1.1.0 httr_1.4.4 survival_3.5-0 png_0.1-8 iterators_1.0.14 stringi_1.7.12 sass_0.4.5 irlba_2.3.5.1 future.apply_1.10.0